HW 03

Author

Joey Garcia

if (!require("pacman")) 
  install.packages("pacman")
Loading required package: pacman
pacman::p_load(tidyverse, 
               lubridate, 
               glue, 
               scales, 
               dplyr, 
               ggthemes, # (above) necessary libs
               RColorBrewer, # Divergence color pallete
               forcats,  # reorder function
               ggimage,  # change background with image
               janitor  # clean data
               )       

# set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)

1 - Du Bois challenge.

income <- read.csv("data/income.csv")

# Add "average_income" to Class Labels
income <- income |>
  mutate(
    Other = replace_na(Other, 0),
    income_label = paste0("$", format(Average_Income, big.mark = ",")),
    ClassLabel = paste0(Class, " | ", income_label),
    ClassLabel = factor(ClassLabel, levels = unique(ClassLabel))
  )

# Define Category order and colors
ordered_categories <- c("Other", "Tax", "Clothes", "Food", "Rent")

category_colors <- c(
  Rent = "#161616",
  Food = "#7d6683",
  Clothes = "#b78a77",
  Tax = "#a9a09d",
  Other = "#bdb09f"
)

# Prepare income data for bar plot
income_clean <- income |>
  select(ClassLabel, Rent, Food, Clothes, Tax, Other) |>
  pivot_longer(cols = Rent:Other, names_to = "Category", values_to = "Percent") |>
  mutate(Category = factor(Category, levels = ordered_categories)) |>
  group_by(ClassLabel) |>
  mutate(pos = cumsum(Percent) - Percent / 2) |>
  ungroup()


# Plot
income_plot <- income_clean |>
  ggplot(aes(x = fct_rev(ClassLabel), y = Percent, fill = Category)) +
  geom_bar(
    stat = "identity", 
    position = "stack", 
    color = "black",
    width = 0.7
  ) +
  geom_text(
    data = income_clean |> filter(Percent > 1),
    aes(label = paste0(round(Percent, 1), "%"), y = pos),
    color = "white", 
    size = 2
  ) +
  scale_fill_manual(values = category_colors) +
  coord_flip(xlim = c(0, 7)) +
  scale_y_continuous(labels = NULL, expand = expansion(mult = c(0.1, 0.1))) +
  
  # Place Social Labels
  annotate(
    "text", x = 1, y = 102, 
    label = "Well-To-Do", 
    size = 2, angle = 90
  ) +
  annotate(
    "text", x = 2.5, y = 102, 
    label = "Comfortable", 
    size = 2,angle = 90
  ) +
  annotate(
    "text", x = 4.5, y = 102, 
    label = "Fair", 
    size = 2, angle = 90
  ) +
  annotate(
    "text", x = 6.5, y = 102, 
    label = "Poor", 
    size = 2, angle = 90
  ) +
  # Bottom text annotation
  annotate(
    "text", x = 0, y = 45, 
    label = "FOR FURTHER STATISTICS RAISE THIS FRAME.", 
    size = 2
  ) +
  labs(
    title = "INCOME AND EXPENDITURE OF 150 BLACK FAMILIES IN ATLANTA,G.A.,U.S.A.",
    subtitle = "Reproducing Du Bois",
    x = "",
    y = "",
    fill = "Annual Expenditure For"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    plot.title.position = "plot",
    plot.title = element_text(hjust = 0, size = 12, face = "bold"),
    axis.title.y = element_blank(),
    axis.text.y = element_text(face = "bold", size = 7),
    panel.grid = element_blank(),
    legend.position = "top",
    legend.title.position = "top",
    legend.title = element_text(hjust = 0.5),
  ) +
  guides(fill = guide_legend(reverse = TRUE))

# Load background image
income_plot <- ggbackground(income_plot, "images/parchment_paper.jpg")
print(income_plot)

2 - COVID survey - interpret

Example 1

For the questions, “Based on my understanding, I believe this vaccine is safe” and “I am concerned about my safety and side effects of the vaccine”, the wider error bars indicate higher variability in responses. This may suggest that there are widely differing perceptions of vaccine safety and potential side effects. Based on my intuition, this is a viable sentiment, this may reflect underlying differences in trust, prior knowledge, or exposure to misinformation.

Example 2

Participants that received the COVID and Flu vaccine expressed a similar sentiment of “feeling safer at work” and confidence of the COVID vaccine vetting process. This shared perspective may reflect institutional trust among dual-vaccinated individuals. In contrast, non-vaccinated participants express skepticism with less confidence in the vaccine development process and reduced trust in the institutions promoting vaccination. I agree with these results, this is highlighting a divergence in vaccines perception of credibility and safety.

Example 3

Nursing and Medical Participants returned surprising results for “feeling safer at work” and confidence of the COVID vaccine vetting process. Nurses showed low variation in their responses with narrower error bars, indicating a high level of trust. In contrast, medical participants exhibited a wider range of responses with larger error bars suggesting greater variability in confidence and perceived workplace safety. This divergence may point to differing experiences, levels of autonomy, or access to information between the two professional groups.

3 - COVID survey - reconstruct

# Load COVID Survey
survey <- read.csv("data/covid-survey.csv")

# Clean Names and Filter NAs
survey_clean <- survey |> 
  row_to_names(row_number = 1) |>
  clean_names() |>
  mutate(across(everything(), ~ na_if(., ""))) |> # empty strings to NA
  filter(if_any(-response_id, ~ !is.na(.)))       # Remove rows where all except 'response_id' are NA

survey_clean |>
  dim()
[1] 1111   14
# recode variables
survey_recoded <- survey_clean |>
  mutate(
    exp_already_vax = recode(
      exp_already_vax, "0" = "No", "1" = "Yes"),
    exp_flu_vax = recode(
      exp_flu_vax, "0" = "No", "1" = "Yes"),
    exp_profession = recode(
      exp_profession, "0" = "Medical", "1" = "Nursing"),
    exp_gender = recode(
      exp_gender, 
      "0" = "Male",
      "1" = "Female", 
      "3" = " Non-binary third gender", 
      "4" = "Prefer not to say"
      ),
    exp_race = recode(
      exp_race, 
      "1" = "American Indian / Alaskan Native", 
      "2" = "Asian", 
      "3" = "Black / African American", 
      "4" = "Native Hawaiian / Other Pacific Islander", 
      "5" = "White"
      ),
    exp_ethnicity = recode(
      exp_ethnicity, 
      "1" = "Hispanic / Latino", 
      "2" = "Non-Hispanic/Non-Latino"
      ),
    exp_age_bin = case_when(
      exp_age_bin == "0" ~ "<20",
      exp_age_bin == "20" ~ "20–25",
      exp_age_bin == "25" ~ "26–30",
      exp_age_bin == "30" ~ ">30",
      TRUE ~ NA_character_  # For NA values
      )
  )

survey_recoded |>
  dim()
[1] 1111   14
covid_survey_longer <- survey_recoded |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

# create quantile values for high and low probabilities
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarize(
    mean = mean(as.numeric(response_value), na.rm = TRUE),
    low  = quantile(as.numeric(response_value), probs = 0.1, na.rm = TRUE),
    high = quantile(as.numeric(response_value), probs = 0.9, na.rm = TRUE),
    .groups = "keep"
  ) 

covid_survey_summary_stats_by_group
# A tibble: 126 × 6
# Groups:   explanatory, explanatory_value, response [126]
   explanatory explanatory_value response        mean   low  high
   <chr>       <chr>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 20–25             resp_concern_…  3.32     2     5
 2 exp_age_bin 20–25             resp_confiden…  1.31     1     2
 3 exp_age_bin 20–25             resp_feel_saf…  1.20     1     2
 4 exp_age_bin 20–25             resp_safety     1.95     1     5
 5 exp_age_bin 20–25             resp_trust_in…  1.29     1     2
 6 exp_age_bin 20–25             resp_will_rec…  1.09     1     1
 7 exp_age_bin 26–30             resp_concern_…  3.35     1     5
 8 exp_age_bin 26–30             resp_confiden…  1.40     1     2
 9 exp_age_bin 26–30             resp_feel_saf…  1.29     1     2
10 exp_age_bin 26–30             resp_safety     2.16     1     5
# ℹ 116 more rows
# introduce "All" as an explanatory row
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarize(
    mean = mean(as.numeric(response_value), na.rm = TRUE),
    low  = quantile(as.numeric(response_value), probs = 0.1, na.rm = TRUE),
    high = quantile(as.numeric(response_value), probs = 0.9, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = "",
    .groups = "drop"
  )

covid_survey_summary_stats_all
# A tibble: 6 × 6
  response         mean   low  high explanatory explanatory_value
  <chr>           <dbl> <dbl> <dbl> <chr>       <chr>            
1 resp_concern_s…  3.28     1     5 All         ""               
2 resp_confidenc…  1.43     1     2 All         ""               
3 resp_feel_safe…  1.36     1     2 All         ""               
4 resp_safety      2.03     1     5 All         ""               
5 resp_trust_info  1.40     1     2 All         ""               
6 resp_will_reco…  1.21     1     2 All         ""               
# Bind summary stats by rows
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_by_group,
  covid_survey_summary_stats_all
)

covid_survey_summary_stats
# A tibble: 132 × 6
# Groups:   explanatory, explanatory_value, response [132]
   explanatory explanatory_value response        mean   low  high
   <chr>       <chr>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 20–25             resp_concern_…  3.32     2     5
 2 exp_age_bin 20–25             resp_confiden…  1.31     1     2
 3 exp_age_bin 20–25             resp_feel_saf…  1.20     1     2
 4 exp_age_bin 20–25             resp_safety     1.95     1     5
 5 exp_age_bin 20–25             resp_trust_in…  1.29     1     2
 6 exp_age_bin 20–25             resp_will_rec…  1.09     1     1
 7 exp_age_bin 26–30             resp_concern_…  3.35     1     5
 8 exp_age_bin 26–30             resp_confiden…  1.40     1     2
 9 exp_age_bin 26–30             resp_feel_saf…  1.29     1     2
10 exp_age_bin 26–30             resp_safety     2.16     1     5
# ℹ 122 more rows
response_labels <- c( 
  "resp_safety" = 
    "Based on my understanding, I believe the vaccine is safe",
  "resp_feel_safe_at_work" = 
    "Getting the vaccine will make me feel safer at work",
  "resp_concern_safety" = 
    "I am concerned about the safety and side effects of the vaccine",
  "resp_confidence_science" = 
    "I am confident in the scientific vetting process for the new COVID vaccines",
  "resp_trust_info" = 
    "I trust the information that I have received about the vaccines",
  "resp_will_recommend" = 
    "I will recommend the vaccine to family, friends, and community members"
)

explanatory_labels <- c(
  "All" = "All",
  "exp_age_bin" = "Age",
  "exp_gender" = "Gender",
  "exp_race" = "Race",
  "exp_ethnicity" = "Ethnicity",
  "exp_profession" = "Profession",
  "exp_already_vax" = "Had COVID vaccine",
  "exp_flu_vax" = "Had flu vaccine this year"
)

covid_plot <- covid_survey_summary_stats |>
  # adjust feature and response names
  mutate(
    response = recode(response, !!!response_labels),
    explanatory = recode(explanatory, !!!explanatory_labels),
    explanatory = factor(explanatory, levels = explanatory_labels) # reorder
  ) |>

  ggplot(aes(y = fct_rev(explanatory_value))) +
  geom_errorbarh(aes(xmin = low, xmax = high), height = 0.4) +
  geom_point(aes(x = mean), color = "black", size = 2) +
  facet_grid(
    explanatory ~ response, 
    labeller = labeller(
      response = label_wrap_gen(15),
      explanatory = label_wrap_gen(15)), 
    scales = "free_y"
  ) +
  labs(
    title = NULL,
    x = "Mean Likert Scored\n(Error bars range from 10th to 90th percentile)",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    strip.text.y = element_text(angle = 0, hjust = 0.5, size = 12), # Rotate "explanatory" labels
    strip.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey90")
  )

covid_plot

4 - COVID survey - re-reconstruct

# create quantile values for high and low probabilities
covid_survey_summary_stats_by_group_alt <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarize(
    mean = mean(as.numeric(response_value), na.rm = TRUE),
    low  = quantile(as.numeric(response_value), probs = 0.25, na.rm = TRUE),
    high = quantile(as.numeric(response_value), probs = 0.75, na.rm = TRUE),
    .groups = "keep"
  ) 

# introduce "All" as an explanatory row
covid_survey_summary_stats_all_alt <- covid_survey_longer |>
  group_by(response) |>
  summarize(
    mean = mean(as.numeric(response_value), na.rm = TRUE),
    low  = quantile(as.numeric(response_value), probs = 0.25, na.rm = TRUE),
    high = quantile(as.numeric(response_value), probs = 0.75, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = "",
    .groups = "drop"
  )

# Bind summary stats by rows
covid_survey_summary_stats_alt <- bind_rows(
  covid_survey_summary_stats_by_group_alt,
  covid_survey_summary_stats_all_alt
)                                       


covid_plot_alt <- covid_survey_summary_stats_alt |>
  # adjust feature and response names
  mutate(
    response = recode(response, !!!response_labels),
    explanatory = recode(explanatory, !!!explanatory_labels),
    explanatory = factor(explanatory, levels = explanatory_labels) # reorder
  ) |>

  ggplot(aes(y = fct_rev(explanatory_value))) +
  geom_errorbarh(aes(xmin = low, xmax = high), height = 0.4) +
  geom_point(aes(x = mean), color = "black", size = 2) +
  facet_grid(
    explanatory ~ response, 
    labeller = labeller(
      response = label_wrap_gen(15),
      explanatory = label_wrap_gen(15)), 
    scales = "free_y"
  ) +
  labs(
    title = NULL,
    x = "Mean Likert Scored\n(Error bars range from 25th to 75th percentile)",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    strip.text.y = element_text(angle = 0, hjust = 0.5, size = 12), # Rotate "explanatory" labels
    strip.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12, hjust = 1),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = "grey90")
  )

covid_plot_alt

Response

When comparing the two graphs, the immediate difference is the smaller intervals because of a reduced percentile range. These percentiles are specified as probabilities between 0 and 1 in the quantile() function.

My view has changed based on the new plot, read the following:

For example 1, the error bars are more narrow for “Based on my understanding, I believe this vaccine is safe”. My conclusion changes, the majority of the error bars now span across the 1 and 2 response values. In contrast, the error bars for the original plot spans across all response values 1 through 5. The new plot portrays to the audience that participants are more certain the “vaccine” is safe.

No change, for example 2.

For example 3, similar to example 1, the error bars are more narrow and show less variability in the data. The answers among Nurses and Medical doesn’t indicate any sign of divergence in opinions. Based on the new plot, Nurses and Medical now show error bars that trend towards positive sentiment towards “trusting the vetting process” and “safer at work”.

5 - COVID survey - another view

# Response Labels for Legend and Fill
response_value_labels <- c(
  "1" = "Strongly Agree",
  "2" = "Somewhat Agree",
  "3" = "Neither Agree or Disagree",
  "4" = "Somewhat Disagree",
  "5" = "Strongly Disagree"
)

# calculate proportion for %100 of 'response'
covid_survey_summary_proportion <- covid_survey_longer |>
  filter(!is.na(response_value)) |>
  group_by(response, response_value) |>
  summarize(n = n(), .groups = "drop") |>
  group_by(response) |>
  mutate(prop = n / sum(n)) # cal. mean

# calculate proportions for Diverging chart
covid_survey_diverge_proportion <- covid_survey_summary_proportion |>
  # classify divergence
  mutate(
    response_value = factor(response_value, levels = c("5", "4", "3", "2", "1")),
    signed_prop = case_when(
      response_value %in% c("1", "2") ~ prop,
      TRUE ~ -prop
    )
  ) 

# Prepare labels for plot
covid_survey_diverge_proportion <- covid_survey_diverge_proportion |>
  # adjust feature response names and values
  mutate(
    response = recode(response, !!!response_labels),
    # wrap text
    response = str_wrap(response, width = 25)
  )
# Legend Configuration
likert_colors <- RColorBrewer::brewer.pal(5, "RdYlBu")
legend_breaks <- rev(c(
  "Strongly Agree",
  "Somewhat Agree",
  "Neither Agree or Disagree",
  "Somewhat Disagree",
  "Strongly Disagree"
))

covid_plot_100_percent <- covid_survey_diverge_proportion |>
  mutate(
    response_value = factor(response_value, levels = c("5", "4", "3", "1", "2")),
    response_value = recode(response_value, !!!response_value_labels)
  ) |>

  ggplot(aes(, x = signed_prop, y = response, fill = response_value)) +
  geom_bar(position='stack', stat='identity') +
  scale_x_continuous(
    limits = c(-.8, 1), 
    breaks = seq(-1, 1, by = 0.25),
    labels = scales::percent
  ) +
  scale_fill_manual(
    values = likert_colors,
    breaks = legend_breaks
  ) +
  labs(
    title = "Diverging Opinions in COVID Survey",
    subtitle = "Interpreting COVID survey using divergent bar", 
    x = "Proportion of Mean Response",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 6),
    axis.text.y = element_text(size = 6, hjust = 1),
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_blank()
  )

covid_plot_100_percent

Diverging bar chart showing proportions of Likert scale responses, from Strongly Disagree to Strongly Agree, across COVID-related survey questions. Bars extend left for negative sentiment and right for positive sentiment, this portrays the disparity of agreement levels across questions relating to participants trust of the COVID vaccine. This graph highlights participant's larger concern of safety and side effects of the COVID vaccine.

covid_plot_100_percent <- covid_survey_diverge_proportion |>
  mutate(
    response_value = recode(response_value, !!!response_value_labels)
  ) |>

  ggplot(aes(y = response, x = prop, fill = response_value)) +
  geom_bar(position='stack', stat='identity') +
  scale_x_continuous(labels = scales::percent) +
  scale_fill_brewer(palette = "RdYlBu") +
  labs(
    title = "Porportionaly Strong Opinions in COVID Survey",
    subtitle = "Intreptating the average response using a stacked bar chart", 
    x = "Proportion of Mean Response",
    y = NULL
  ) +
  theme_minimal() +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.position = "top",
    legend.title = element_blank(),
    legend.text = element_text(size = 6),
    axis.text.y = element_text(size = 6, hjust = 1),
    panel.grid.major = element_blank()
  )

covid_plot_100_percent

Stacked bar chart showing proportions of Likert scale responses, from Strongly Agree to Strongly Disagree, across COVID-related survey questions, scaled from 0% to 100%. The chart highlights participants' difference of opinion concerning safety and side effects of the COVID vaccine.